MaxPoolFusion
=================

对输入张量执行最大池化操作,并将池化结果裁剪到 :math:`[\text{minf},\ \text{maxf}]` 区间。

.. math::

    \text{output}_{b, h_o, w_o, c} = \operatorname{clip}\Bigg( \max_{(h_i, w_i) \in \mathcal{W}(h_o, w_o)} \Big( \text{input}_{b,\; h_i,\; w_i,\; c} \Big),\; \text{minf},\; \text{maxf} \Bigg)

其中,窗口区域 :math:`\mathcal{W}(h_o, w_o)` 的定义如下:

.. math::

    h_i = h_o \cdot \text{stride}_h - \text{pad}_u + \Delta h

    w_i = w_o \cdot \text{stride}_w - \text{pad}_l + \Delta w

    \Delta h \in [0,\ \text{win}_h - 1],\quad \Delta w \in [0,\ \text{win}_w - 1]

有效窗口点满足:

.. math::

    0 \le h_i < \text{in}_h,\qquad 0 \le w_i < \text{in}_w

原始窗口起点定义为:

.. math::

    h_{\text{start}} = h_o \cdot \text{stride}_h - \text{pad}_u

.. math::

    w_{\text{start}} = w_o \cdot \text{stride}_w - \text{pad}_l

合法采样范围为:

.. math::

    \Delta h \in \Big[ \max(0,\ -h_{\text{start}}),\; \min(\text{win}_h,\ \text{in}_h - h_{\text{start}}) \Big)

.. math::

    \Delta w \in \Big[ \max(0,\ -w_{\text{start}}),\; \min(\text{win}_w,\ \text{in}_w - w_{\text{start}}) \Big)

最大池化计算:

.. math::

    v_{\max} = \max_{\Delta h,\ \Delta w}\; \text{input}_{b,\; h_{\text{start}} + \Delta h,\; w_{\text{start}} + \Delta w,\; c}

最终输出:

.. math::

    \text{output}_{b, h_o, w_o, c} = \min\big(\max(v_{\max},\ \text{minf}),\ \text{maxf}\big)

输入:
    - **input** - 输入张量指针,采用 **NHWC 格式**,形状为 :math:`[batch,\ in\_h,\ in\_w,\ channel]`
    - **in_w** - 输入张量的宽度 (W)
    - **in_h** - 输入张量的高度 (H)
    - **win_w** - 池化窗口的宽度,即窗口在 W 方向的大小
    - **win_h** - 池化窗口的高度,即窗口在 H 方向的大小
    - **output_w** - 输出特征图的宽度
    - **output_h** - 输出特征图的高度
    - **batch** - 批次大小,即输入中的 batch 数
    - **channel** - 通道数 C ,每个池化位置都分别对 C 个通道独立执行最大池化与裁剪
    - **stride_w** - 池化窗口在 W 方向的步长
    - **stride_h** - 池化窗口在 H 方向的步长
    - **pad_l** - 输入特征图左侧的填充大小
    - **pad_u** - 输入特征图上侧的填充大小
    - **minf** - 输出结果的下界值。池化结果会执行 :math:`\max(v,\ \text{minf})`
    - **maxf** - 输出结果的上界值。池化结果会执行 :math:`\min(v,\ \text{maxf})`
    - **core_mask** - 核心掩码,指定使用的计算核心(仅共享存储版本需要)

输出:
    - **output** - 输出张量指针,采用 **NHWC 格式**,形状为 :math:`[batch,\ output\_h,\ output\_w,\ channel]`。

支持平台:
    ``FT78NE`` ``MT7004``

.. note::
    - FT78NE 支持 fp32, fp64
    - MT7004 支持 fp16, fp32
    - 调用时将除 core_mask 外的参数打包通过 long long params 数组传入,顺序为: input, output, in_w, in_h, win_w, win_h, output_w, output_h, batch, channel, stride_w, stride_h, pad_l, pad_u, minf, maxf
    - 其中 minf、maxf 以变量地址的形式传入(即 ``(long long)&minf``、``(long long)&maxf``),其余标量参数按值传入,参见下方示例

**共享存储版本:**

.. c:function:: void hp_maxpool_fusion_s(long long *params, int core_mask)
.. c:function:: void fp_maxpool_fusion_s(long long *params, int core_mask)
.. c:function:: void dp_maxpool_fusion_s(long long *params, int core_mask)

**C调用示例:**

.. code-block:: c
    :linenos:
    :emphasize-lines: 45

    //FT78NE示例
    #include <...>  // 注意:原文此处头文件名缺失,请替换为声明该接口的实际头文件

    int main(int argc, char* argv[]) {
        float* input_ptr = (float*)0xA0000000;
        float* output_ptr = (float*)0xB0000000;
        float* check_ptr = (float*)0xC0000000;
        int in_w = 32;
        int in_h = 32;
        int win_w = 6;
        int win_h = 6;
        int batch = 4;
        int channel = 2;
        int stride_w = 4;
        int stride_h = 4;
        int pad_l = 0;
        int pad_u = 0;
        float minf = 0.0f;
        float maxf = 50.0f;

        // 根据标准公式计算输出尺寸(向上取整,且假定左右、上下填充对称)
        int dividor = in_w + pad_l * 2 - win_w;
        int output_w = (dividor + stride_w - 1) / stride_w + 1;
        int dividor2 = in_h + pad_u * 2 - win_h;
        int output_h = (dividor2 + stride_h - 1) / stride_h + 1;

        long long params[16];
        params[0] = (long long)input_ptr;
        params[1] = (long long)output_ptr;
        params[2] = (long long)in_w;
        params[3] = (long long)in_h;
        params[4] = (long long)win_w;
        params[5] = (long long)win_h;
        params[6] = (long long)output_w;
        params[7] = (long long)output_h;
        params[8] = (long long)batch;
        params[9] = (long long)channel;
        params[10] = (long long)stride_w;
        params[11] = (long long)stride_h;
        params[12] = (long long)pad_l;
        params[13] = (long long)pad_u;
        params[14] = (long long)&minf;
        params[15] = (long long)&maxf;
        int core_mask = 0x0f;
        fp_maxpool_fusion_s(params, core_mask);
        return 0;
    }

**私有存储版本:**

.. c:function:: void hp_maxpool_fusion_p(long long *params)
.. c:function:: void fp_maxpool_fusion_p(long long *params)
.. c:function:: void dp_maxpool_fusion_p(long long *params)

**C调用示例:**

.. code-block:: c
    :linenos:
    :emphasize-lines: 44

    //FT78NE示例
    #include <...>  // 注意:原文此处头文件名缺失,请替换为声明该接口的实际头文件

    int main(int argc, char* argv[]) {
        float* input_ptr = (float*)0xA0000000;
        float* output_ptr = (float*)0xB0000000;
        float* check_ptr = (float*)0xC0000000;
        int in_w = 32;
        int in_h = 32;
        int win_w = 6;
        int win_h = 6;
        int batch = 4;
        int channel = 2;
        int stride_w = 4;
        int stride_h = 4;
        int pad_l = 0;
        int pad_u = 0;
        float minf = 0.0f;
        float maxf = 50.0f;

        // 根据标准公式计算输出尺寸(向上取整,且假定左右、上下填充对称)
        int dividor = in_w + pad_l * 2 - win_w;
        int output_w = (dividor + stride_w - 1) / stride_w + 1;
        int dividor2 = in_h + pad_u * 2 - win_h;
        int output_h = (dividor2 + stride_h - 1) / stride_h + 1;

        long long params[16];
        params[0] = (long long)input_ptr;
        params[1] = (long long)output_ptr;
        params[2] = (long long)in_w;
        params[3] = (long long)in_h;
        params[4] = (long long)win_w;
        params[5] = (long long)win_h;
        params[6] = (long long)output_w;
        params[7] = (long long)output_h;
        params[8] = (long long)batch;
        params[9] = (long long)channel;
        params[10] = (long long)stride_w;
        params[11] = (long long)stride_h;
        params[12] = (long long)pad_l;
        params[13] = (long long)pad_u;
        params[14] = (long long)&minf;
        params[15] = (long long)&maxf;
        fp_maxpool_fusion_p(params);
        return 0;
    }